mask_siteid_sampling <- site_protocol_quanti[
  site_protocol_quanti$variable == "year" &
    site_protocol_quanti$n >= 10,
  ]$siteid

mask_siteid_protocol <- site_protocol_quali[
  site_protocol_quali$unitabundance %in% c("Count", "Ind.100m2"), ]$siteid

mask_siteid <- mask_siteid_sampling[
    mask_siteid_sampling %in% mask_siteid_protocol]
dir_plot <- here("doc", "fig", "raw_data")
if (!dir.exists(dir_plot)) {
  dir.create(dir_plot)
}

abun_rich_nested <- filtered_dataset$abun_rich_op %>%
  nest_by(siteid)

library(furrr)
plan(multisession, workers = 3)
future_walk2(abun_rich_nested$data, abun_rich_nested$siteid,
  function (x, y, ...) {

    png(paste0(dir_plot, "/species_nb_site_", y, ".png"), width = 500, height = 500*1/1.6)

    p <- plot_community_data(
        dataset = x, y = "species_nb", x = "year", title = y)

    print(p)

    dev.off()
  })

future_walk2(abun_rich_nested$data, abun_rich_nested$siteid,
  function (x, y, ...) {

    png(paste0(dir_plot, "/species_nb_site_", y, ".png"), width = 500, height = 500*1/1.6)

    p <- plot_community_data(
      dataset = x, y = "total_abundance", x = "year", title = y)

    print(p)

    dev.off()
  })

1 Maps

library(mapview)
library(leafpop)
tar_load(filtered_dataset)
loc <- filtered_dataset$location %>%
  left_join(filtered_dataset$site_quali, by = "siteid") %>%
  st_as_sf(coords = c("longitude", "latitude"),
  crs = 4326)
site_richness <- filtered_dataset$measurement %>%
  group_by(siteid) %>%
  summarise(tot_nb_species = length(unique(species)))

op_richness_summary <-
  filtered_dataset$abun_rich_op %>%
  group_by(siteid) %>%
  summarise(enframe(summary_distribution(species_nb)), .groups = "drop") %>%
  pivot_wider(names_from = "name", values_from = "value") %>%
  rename(median_richness = median)
var_map_view <- c("siteid", "protocol", "unitabundance", "unitbiomass", "min",
  "max", "completeness", "tot_nb_richness", "median_richness")
loc2 <- loc %>%
  left_join(op_richness_summary, by = "siteid") %>%
  select(any_of(var_map_view)) %>%
  left_join(site_richness, by = "siteid") %>%
  select(any_of(var_map_view))
mapView(loc2, zcol = "protocol")
get_file_plot_in_tbl <- function(
  directory = NULL,
  str_file_to_match = NULL,
  regex_pattern = NULL) {

  file_plot_site <- list.files(directory, full.names = TRUE)
  filtered_file_plot_site <- file_plot_site[
    str_detect(file_plot_site, str_file_to_match)]

  names(filtered_file_plot_site) <- str_extract(filtered_file_plot_site,
    regex_pattern)
  filtered_file_plot_site <- enframe(
    filtered_file_plot_site,
    name = "siteid",
    value = "file"
  )

}

abun_file_plot_site_tbl <- get_file_plot_in_tbl(
  directory = "fig/raw_data",
  str_file_to_match = "tot_abun",
  regex_pattern = "S\\d+"

)

richness_file_plot_site_tbl <- get_file_plot_in_tbl(
  directory = dir_plot,
  str_file_to_match = "species_nb",
  regex_pattern = "S\\d+") %>%
  rename(richness = file)
loc2 <- loc %>%
  select(siteid, protocol) %>%
  left_join(abun_file_plot_site_tbl, by = "siteid") %>%
  left_join(richness_file_plot_site_tbl, by = "siteid")
m <- mapView(loc2, zcol = "protocol",
  popup = popupImage(loc2$file)
)
mapshot(m, url = "map_abundance.html",
  selfcontained = FALSE,
)
m2 <- mapView(loc2, zcol = "protocol",
  popup = popupImage(loc2$richness)
)
mapshot(m2, url = "map_richness.html",
  selfcontained = FALSE
)
trends_data <- abun_rich_op %>%
  left_join(op_protocol, by = "op_id") %>%
  filter(siteid %in% mask_siteid) %>%
  mutate(
    log_total_abundance = log(total_abundance),
    log_species_nb = log(species_nb)
  )
plot_trends <- trends_data %>%
  group_by(siteid) %>%
  nest() %>%
  ungroup() %>%
  slice_sample(n = 100) %>%
  mutate(
    p_abun = map2(data, siteid,
      ~plot_community_data(
        dataset = .x, y = "total_abundance", x = "year", title = .y)),
    p_rich = map2(data, siteid,
      ~plot_community_data(
        dataset = .x, y = "species_nb", x = "year", title = .y),
    )
  )

1.1 Total abundance

n_plot_by_batch <- 8
map(
  split(
    seq_len(nrow(plot_trends)),
    1:floor(nrow(plot_trends) / n_plot_by_batch) + 1),
  ~plot_grid(plotlist = plot_trends[.x, ]$p_abun)
  )
#> Warning in split.default(seq_len(nrow(plot_trends)), 1:floor(nrow(plot_trends)/
#> n_plot_by_batch) + : data length is not a multiple of split variable
#> Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
#> parametric, : pseudoinverse used at 2007
#> Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
#> parametric, : neighborhood radius 2
#> Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
#> parametric, : reciprocal condition number 0
#> Warning in predLoess(object$y, object$x, newx = if
#> (is.null(newdata)) object$x else if (is.data.frame(newdata))
#> as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
#> 2007
#> Warning in predLoess(object$y, object$x, newx = if
#> (is.null(newdata)) object$x else if (is.data.frame(newdata))
#> as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2
#> Warning in predLoess(object$y, object$x, newx = if
#> (is.null(newdata)) object$x else if (is.data.frame(newdata))
#> as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
#> number 0
#> $`2`

#> 
#> $`3`

#> 
#> $`4`

#> 
#> $`5`

#> 
#> $`6`

#> 
#> $`7`

#> 
#> $`8`

#> 
#> $`9`

#> 
#> $`10`

#> 
#> $`11`

#> 
#> $`12`

#> 
#> $`13`

1.2 Species richness

map(
  split(
    seq_len(nrow(plot_trends)),
    1:floor(nrow(plot_trends) / n_plot_by_batch) + 1
    ),
  ~plot_grid(plotlist = plot_trends[.x, ]$p_rich)
  )
#> Warning in split.default(seq_len(nrow(plot_trends)), 1:floor(nrow(plot_trends)/
#> n_plot_by_batch) + : data length is not a multiple of split variable
#> Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
#> parametric, : pseudoinverse used at 2007
#> Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
#> parametric, : neighborhood radius 2
#> Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
#> parametric, : reciprocal condition number 0
#> Warning in predLoess(object$y, object$x, newx = if
#> (is.null(newdata)) object$x else if (is.data.frame(newdata))
#> as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
#> 2007
#> Warning in predLoess(object$y, object$x, newx = if
#> (is.null(newdata)) object$x else if (is.data.frame(newdata))
#> as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2
#> Warning in predLoess(object$y, object$x, newx = if
#> (is.null(newdata)) object$x else if (is.data.frame(newdata))
#> as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
#> number 0
#> $`2`

#> 
#> $`3`

#> 
#> $`4`

#> 
#> $`5`

#> 
#> $`6`

#> 
#> $`7`

#> 
#> $`8`

#> 
#> $`9`

#> 
#> $`10`

#> 
#> $`11`

#> 
#> $`12`

#> 
#> $`13`

1.3

tar_load(toy_dataset)
unique(toy_dataset$siteid)
#> [1] "S8633"  "S11138" "S534"   "S529"   "S11219"
plot_temporal_biomass <- function (bm_data = NULL, biomass_var = NULL, com = NULL, .log = FALSE) {

  #main_title <- paste0("Stab = ", round(1/(sync$cv_com), 2),", ", "Sync = ",
    #round(sync$synchrony, 2),", ", "CVsp = ", round(sync$cv_sp, 2))
  sym_bm_var <- rlang::sym(biomass_var)
  # Total
  total_biomass <- bm_data %>% 
  group_by(date) %>%
  summarise(!!sym_bm_var := sum(!!sym_bm_var, na.rm = FALSE))
  
  p <- bm_data %>%
    mutate(label = if_else(date == max(date), as.character(species), NA_character_)) %>%
  ggplot(aes_string(x = "date", y = biomass_var, color = "species")) + 
  geom_line() +
  lims(y = c(0, max(total_biomass[[biomass_var]]))) +
  labs(
  #title = main_title, subtitle = paste0("Station: ", station),
    y = "Biomass (g)", x = "Sampling date"
  ) +
  ggrepel::geom_label_repel(aes(label = label),
    size = 2.5, nudge_x = 1, na.rm = TRUE) 
  
  # Add total biomass
  p2 <- p +
    geom_line(data = total_biomass, aes(color = "black", size = 3)) +
    theme(legend.position = "none")

  # Add summary: richness, connectance, stab, t_lvl, sync, cv_sp 
  com %<>%
    mutate_if(is.double, round(., 2))

  label <- paste(
    "S = ", com$bm_std_stab,
    "sync = ", com$sync,
    "CVsp = ", com$cv_sp,
    "R = ", com$rich_tot_std,
    "C = ", com$ct,
    "Tlvl = ", com$t_lvl
  ) 

  p3 <- p2 +
    annotate("text", x = median(total_biomass$date),
      y = 15, label = label)

  if (.log) {
    p3 <- p3 + scale_y_log10() 
  }

  return(p3)
}

ti <- toy_dataset %>%
  filter(siteid == unique(toy_dataset$siteid)[2])
plot_population <- function (dataset = NULL, y_var = NULL, time_var = NULL) {

  sym_y_var <- rlang::sym(y_var)
  sym_time_var <- rlang::sym(time_var)
  # Total
  total_dataset <- dataset %>%
  group_by(!!sym_time_var) %>%
  summarise(!!sym_y_var := sum(!!sym_y_var, na.rm = FALSE))
  
  p <- dataset %>%
    mutate(label = if_else(!!sym_time_var == max(!!sym_time_var), as.character(species), NA_character_)) %>%
  ggplot(aes_string(x = time_var, y = y_var, color = "species")) + 
  geom_line() +
  lims(y = c(0, max(total_dataset[[y_var]]))) +
  labs(
  #title = main_title, subtitle = paste0("Station: ", station),
    y = "Biomass (g)", x = "Sampling time_var"
  ) +
  ggrepel::geom_label_repel(aes(label = label),
    size = 2.5, nudge_x = 1, na.rm = TRUE)
  
  # Add total biomass
  p2 <- p +
    geom_line(data = total_dataset, aes(color = "black", size = 3)) +
    theme(legend.position = "none")
  return(p2)

}

plot_population(dataset = ti, y_var = "abundance", time_var = "year")
#> Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps

plot_temporal_population(com = ti, ribbon = FALSE)

p <- plot_temporal_population(com = ti, ribbon = TRUE)

GeomRibbon$handle_na <- function(data, params) {  data }
p$data %>%
  ggplot(
    aes(y = abundance, ymin = ymin, ymax = ymax, x = year,
      fill = species)
    ) +
  geom_ribbon()
set.seed(1)

test <- data.frame(x = rep(1:10, 3), y = abs(rnorm(30)), z = rep(LETTERS[1:3],
    10)) %>% arrange(x, z)

test[test$x == 4, "y"] <- NA

test$ymax <- test$y
test$ymin <- 0
zl <- unique(test$z)
for (i in 2:length(zl)) {
    zi <- test$z == zl[i]
    zi_1 <- test$z == zl[i - 1]
    test$ymin[zi] <- test$ymax[zi_1]
    test$ymax[zi] <- test$ymin[zi] + test$ymax[zi]
}


# fix GeomRibbon
GeomRibbon$handle_na <- function(data, params) {  data }

ggplot(test, aes(x = x, y=y, ymax = ymax, ymin = ymin, fill = z)) +
  geom_ribbon()
  • Be careful to multiple fishing per year
toy_dataset %>%
  group_by(siteid, year, species) %>%
  summarise(test=n()>1) %>%
  filter(test)
pop_trends <- toy_dataset %>%
  filter(!siteid %in% c("S534", "S8633")) %>%
  group_by(siteid) %>%
  nest() %>%
  mutate(
    p_pop = map(data, ~plot_temporal_population(com = .x, ))
  )
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(species_var)` instead of `species_var` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(y_var)` instead of `y_var` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(species)` instead of `species` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
pop_trends$p_pop
#> [[1]]
#> Warning: Removed 280 rows containing missing values (position_stack).

#> 
#> [[2]]

#> 
#> [[3]]
#> Warning: Removed 104 rows containing missing values (position_stack).

2 Environmental variables

The operation are too time consuming, so I set eval = FALSE from here.

knitr::opts_chunk$set(eval=FALSE)

2.1 River atlas

#

eu_shp <- rvat_shp_files[stringr::str_detect(rvat_shp_files, "eu.shp$")]
eu_shp_layers <- sf::st_layers(eu_shp, do_count = TRUE)

#<!--There are `r length(rvat_shp_files)`. The dataset is gigantic,-->
#<!--`r eu_shp_layers$field` variables...-->
  • Looking at the columns:
tictoc::tic()
riveratlas_slice <- sf::read_sf(eu_shp, query = "SELECT * FROM RiverATLAS_v10_eu WHERE FID = 1")
tictoc::toc()
#313.564 sec elapsed

col_riveratlas <- colnames(riveratlas_slice)
stream_charac <- col_riveratlas[str_detect(col_riveratlas, "[A-Z]")]

# environmental variable begins by three character
env_stream <- str_sub(col_riveratlas[!col_riveratlas %in% stream_charac], 1, 3)
unique_env_type <- unique(env_stream)
tictoc::tic()
riv_atlas <- sf::read_sf(eu_shp, query = "SELECT FID FROM RiverATLAS_v10_eu")
tictoc::toc()
#188.71 sec elapsed

loc <- get_site_location_as_sf()
tar_load(filtered_dataset)
world <- ne_countries(returnclass = "sf") %>%
  select(sov_a3)

loc <- filtered_dataset$location %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)
# Probleme de coordonées en Suède et en Angleterre
overlap_mask <- st_intersects(loc, world)
overlap_mask <- st_is_within_distance(loc, world, dist = 100)
mask_country_loc <- map_lgl(overlap_mask, any)

plot(st_geometry(world %>% st_crop(st_bbox(loc[!mask_country_loc, "siteid"]))))
plot(st_geometry(loc[!mask_country_loc, "siteid"]), add = TRUE)
mask_world <- map_int(overlap_mask[mask_country_loc], 1) %>% unique
selected_country <- world[mask_world, ]
plot(st_geometry(selected_country))
its_rv_country <- st_intersects(test, selected_country)
mask_river_country <- map_lgl(its_rv_country, any)
sum(mask_river_country) / length(mask_river_country)
map_dbl(list(test[mask_river_country, ], test), object.size) %>% reduce(., `/`)

snapped_pt <- st_snap(
  st_transform(loc, crs = 54008),
  st_transform(test[mask_river_country, ], crs = 54008),
  tolerance = 200
)
  • Match with convex hull
hull_station <- st_convex_hull(st_union(loc))
plot(st_geometry(world))
plot(st_geometry(hull_station), add = TRUE)
its_rv_st <- st_intersects(test, hull_station)
mask_river_atlas <- map_lgl(its_rv_st, any)
sum(mask_river_atlas) / length(mask_river_atlas)
map_dbl(list(test[mask_river_atlas, ], test), object.size) %>% reduce(., `/`) 
?st_nearest_points
mask_country_loc <- map_lgl(overlap_mask, any)
plot(st_geometry(world %>% st_crop(st_bbox(loc[!mask_country_loc, "siteid"]))))
plot(st_geometry(loc[!mask_country_loc, "siteid"]), add = TRUE)
unit(10, units = "cm")
st_bbox(riv_atlas)
loc %>%
  st_crop(st_bbox(riv_atlas))

st_bbox(riv_atlas)
esp <- loc %>%
  filter(country == "ESP")
esp_riv_atlas <- riv_atlas %>%
  st_crop(st_bbox(esp))
  
snapped_pt <- st_snap(
  st_transform(esp, crs = 54008),
  st_transform(esp_riv_atlas, crs = 54008),
  tolerance = 200
)
nearest_snapped_pt <- st_nearest_points(
  st_transform(esp, crs = 54008),
  st_transform(esp_riv_atlas, crs = 54008),
  tolerance = 200
)

plot(st_geometry(esp_riv_atlas))
plot(st_geometry(esp), add = TRUE)
plot(st_geometry(snapped_pt) %>% st_transform(crs = 4326), col = "red", add = TRUE)
plot(st_geometry(esp))
plot(st_geometry(snapped_pt) %>% st_transform(crs = 4326), col = "red", add = TRUE)


plot(st_geometry(esp))
plot(st_geometry(nearest_snapped_pt) %>% st_transform(crs = 4326), col = "red", add = TRUE)
nearest_snapped_pt_sf <- st_snap_points2(
  st_transform(esp, crs = 54008),
  st_transform(esp_riv_atlas, crs = 54008),
  max_dist = 2000,
  return_snapped_pt = TRUE
)


plot(st_geometry(esp_riv_atlas[nearest_snapped_pt, ]))
plot(st_geometry(esp), add = TRUE)
plot(st_geometry(nearest_snapped_pt_sf) %>% st_transform(crs = 4326), col = "red", add = TRUE)
  • Test with problematic sweden:
riv_atlas <- st_transform(riv_atlas, crs = 3152)
swe_site <- loc %>%
  filter(country == "SWE")
swe_site <- st_transform(swe_site, crs = 3152)
#debugonce(st_snap_points)
ti <- match_river_site(
  river = riv_atlas,
  site = swe_site,
  max_dist = 10000,
  metric_crs = NULL,
  return_snapped_pt = FALSE,
  crop = TRUE,
  buffer_dist = NULL,
  col_id_river = "FID",
  site_id = "siteid"
)
dev.new()
plot(st_geometry(riv_atlas[riv_atlas$FID %in% ti$riverid, ]))
plot(st_geometry(swe_site), add = TRUE)
plot(st_geometry(riv_atlas))
plot(st_geometry(swe_site), add = TRUE)
plot(st_geometry(ti) %>% st_transform(crs = 4326), col = "red", add = TRUE)
crop_eu <- loc %>% 
  st_transform(crs = 3152) %>%
  st_crop(st_bbox(riv_atlas)) %>%
  arrange(main_bas)
riv_atlas_loc <- riv_atlas %>%
  st_crop(st_bbox(crop_eu))
  
ind <- seq_len(nrow(crop_eu))
nb_points_chuncks <- 200
chuncks <- split(ind, ceiling(seq_along(ind)/nb_points_chuncks))

nb_cores <- availableCores()
plan(strategy = multisession(workers = 2))
correspondance <- furrr::future_map_dfr(
  chuncks[1:2],
  ~match_river_site(
    river = riv_atlas_loc,
    site = crop_eu[.x,],
    max_dist = 2000,
    metric_crs = NULL,
    return_snapped_pt = FALSE,
    crop = TRUE,
    buffer_dist = NULL,
    col_id_river = "FID",
    site_id = "siteid"
  )
)
tar_load(c(filtered_dataset, riveratlas_shp_files))
crop_eu <- filtered_dataset$location %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
  st_transform(crs =  4087) 
river_shp_filepath <- riveratlas_shp_files[str_detect(riveratlas_shp_files, "eu")]
 layer_name <- sf::st_layers(river_shp_filepath, do_count = TRUE)$name
  river <- sf::read_sf(river_shp_filepath, query = paste0("SELECT FID FROM ", layer_name)) %>%
    st_transform(crs = 4087)
riv_atlas_loc <- river %>%
  st_crop(st_bbox(crop_eu))
  
ind <- seq_len(nrow(crop_eu))
nb_points_chuncks <- 200
chuncks <- split(ind, ceiling(seq_along(ind)/nb_points_chuncks))
loc <- crop_eu %>%
  st_crop(st_bbox( riv_atlas_loc)) %>%
  arrange(main_bas) %>%
  slice(1:10)
ti <- match_river_site(
    river = riv_atlas_loc,
    site = loc,
    max_dist = 2000,
    metric_crs = NULL,
    return_snapped_pt = FALSE,
    crop = TRUE,
    buffer_dist = NULL,
    col_id_river = "FID",
    site_id = "siteid"
  )
library(mapview)

mapview(esp_riv_atlas) +
  mapview(nearest_snapped_pt %>%
    st_transform(crs = 54008),
  col.regions = "red") +
  esp

2.2 Water temperature

library(ncdf4)

tar_load(water_temperature_file)
wt_path <- R.utils::filePath(water_temperature_file, expandLinks = "any")
wt_nc <- nc_open(wt_path)
wt_nc
  • Get time variable:
ncatt_get(wt_nc, "time")

unit_time <- ncatt_get(wt_nc, "time", "units")$value
origin_time <- str_extract(unit_time, "[0-9]{4}-[0-9]{2}-[0-9]{2}")

time_var <- ncvar_get(wt_nc, "time")
time_var %>% tail + ymd(origin_time)
nc_close()
  • Load the file:
library(terra)
wt <- rast(wt_path)
terra::extract(wt, vect(loc[1,]))

2.2.1 Inspect values

tar_load(wt)
hist(wt$tmp)
summary(wt$tmp)
mask_abberant_tmp <- !is.na(wt$tmp) & wt$tmp >= 40
sum(mask_abberant_tmp, na.rm = TRUE)

summary(wt[!mask_abberant_tmp, ])
limit_abberant <- wt %>%
  filter(tmp <= 40) %>%
  mutate(month = month(date)) %>%
  nest_by(siteid, month) %>%
  summarise(
    mean_tmp = mean(data$tmp, na.rm = T),
    sd_tmp = sd(data$tmp, na.rm = T), 
  .groups = "drop"
  )
limit_abberant %>%
  filter(is.na(sd_tmp))
wt[wt$siteid %in%
  limit_abberant[is.na(limit_abberant$sd_tmp),]$siteid,
] %>%
  ggplot(aes(y = tmp, x = date, color = siteid)) +
  geom_line() +
  labs(x = "Date", y = "Temperature (°C)") +
  scale_y_log10()
wt_check <- wt %>%
  mutate(month = month(date)) %>%
  left_join(limit_abberant, by = c("siteid", "month")) %>%
  filter(!is.na(sd_tmp)) %>%
  filter(tmp <= 40) %>%
  mutate(check_tmp = abs(tmp - mean_tmp) >= 5 * sd_tmp)

summary(wt_check[!wt_check$check_tmp,])
sum(wt_check$check_tmp)

mask_site <- unique(wt_check[wt_check$check_tmp, ]$siteid)
ti <- wt_check[wt_check$siteid %in% mask_site,] 
unique(ti$siteid)
ti[ti$siteid %in% unique(ti$siteid)[5:10], ] %>%
  ggplot(aes(y = tmp, x = date, color = siteid)) +
  geom_line() +
  labs(x = "Date", y = "Temperature (°C)") +
  geom_point(
    data = ti[
      ti$siteid %in% unique(ti$siteid)[1:5] &
      ti$check_tmp,
      ], size =4)
wt_check %>%
  filter(!check_tmp) %>%
  select(siteid, date, tmp)

3 Exotic species

tar_load(c(world_site_sf, filtered_dataset))
loc <- world_site_sf$site
#https://github.com/ropensci/rfishbase
# check function validate names
# Missing basin: distance bw site and basin
# closest basin, same country
# NAS USGS
basin_tedesco <- read_sf(
  here("inst", "extdata", "Tedesco_2017", "Basin042017_3119.shp")
  ) %>%
  clean_names()
plot(st_geometry(basin_tedesco))
occ_exotic <- read_csv(
  here("inst", "extdata", "Tedesco_2017", "Occurrence_Table_12092019.csv"),
  col_types = list(`5.Fishbase.Species.Code` = col_character())) %>%
janitor::clean_names() %>%
rename_with(~str_remove(.x, "^x\\d_")) %>%
mutate(species = str_replace_all(fishbase_valid_species_name, "\\.", " "))
summary_exotic <- occ_exotic %>%
  group_by(basin_name, native_exotic_status) %>%
  summarise(nb_species = length(unique(species)), .groups = "drop")  %>%
  pivot_wider(names_from = "native_exotic_status", values_from = "nb_species") %>%
  mutate(across(where(is.integer), ~ifelse(is.na(.), 0, .))) %>%
  mutate(per_exo = exotic / (exotic + native))

summary_exotic %>%
  ggplot(aes(x = per_exo)) +
  geom_histogram()
library(rfishbase)
val_names_tedesco <- validate_names(unique(occ_exotic$species))
sum(is.na(val_names))

val_name_rivfishtime <- validate_names(unique(occ_exotic$species))
plot(st_geometry(world_site_sf$world))
plot(sample_n(loc, 50)["siteid"], add = T)
within <- st_within(loc, basin_tedesco)

exo_basin_site <- tibble(
  siteid = loc$siteid,
  basin_name = map(within, ~basin_tedesco[.x, ]$basin_name)) %>%
unnest(basin_name)
length(unique(exo_basin_site$siteid)) == nrow(exo_basin_site)
missed_sites_loc <- loc %>%
  filter(!siteid %in% exo_basin_site$siteid)
zoom_map <- st_bbox(missed_sites_loc)
basin_tedesco %>%
  ggplot() +
  geom_sf() +
  geom_sf(data = missed_sites_loc) +
  coord_sf(
    xlim = c(zoom_map["xmin"], zoom_map["xmax"]),
    ylim = c(zoom_map["ymin"], zoom_map["ymax"]))
m_sites_buf <- missed_sites_loc %>%
  st_buffer(dist = .5)
within <- st_nearest_points(basin_tedesco, missed_sites_loc)
exo_basin_m_site <- tibble(
  siteid = m_sites_buf$siteid,
  basin_name = map(within, ~basin_tedesco[.x, ]$basin_name)) %>%
unnest(basin_name)
world_site_sf$world %>%
  ggplot() +
  geom_sf() +
  geom_sf(data = loc %>%
  filter(!siteid %in% exo_basin_site$siteid)
  )
exo_sp_site <- exo_basin_site %>%
  left_join(occ_exotic %>%
    select(basin_name, species, native_exotic_status),
  by = "basin_name") %>%
select(-basin_name)

meas_exo <- filtered_dataset$measurement %>%
  left_join(exo_sp_site, by = c("siteid", "species"))

exo_sp_site %>%
  filter(str_detect(species, "Ang*"))

sum(is.na(meas_exo$native_exotic_status))
meas_exo %>%
  filter(is.na(native_exotic_status)) %>% 
  filter(siteid %in% exo_basin_site$siteid)
meas_exo %>%
  filter(is.na(native_exotic_status)) %>% 
  filter(siteid %in% exo_basin_site$siteid) %>%
  summarise(across(where(is.character), ~length(unique(.))))
abun_rich_exo <- meas_exo %>%
  group_by(op_id) %>%
  summarise(
    species_nb = length(unique(species)),
    total_abundance = sum(abundance),
    na_exo_abun = sum(abundance[is.na(native_exotic_status)]),
    na_exo_sp = length(unique(species[is.na(native_exotic_status)])),
    species_nb_nat = length(unique(species[native_exotic_status == "native"])),
    species_nb_exo = length(unique(species[native_exotic_status == "exotic"]))
  ) %>%
  mutate(
    perc_na_exo_abun = na_exo_abun / total_abundance,
    perc_na_exo_sp = na_exo_sp / species_nb,
    perc_exo_sp = species_nb_exo / species_nb 
  )
summary(abun_rich_exo)
abun_rich_exo %>%
  filter(perc_na_exo_abun >= 0.05)
abun_rich_exo %>%
  filter(!perc_na_exo_abun >= 0.05) %>%
  select(op_id, species_nb_nat, species_nb_exo,
    perc_na_exo_abun, perc_na_exo_sp, perc_exo_sp)
ti <- abun_rich_exo %>%
  left_join(select(filtered_dataset$abun_rich_op, op_id, siteid, year), by =
    "op_id"
  )

4 Misc

4.1 Reproducibility

Reproducibility receipt
## datetime
Sys.time()

## repository
if(requireNamespace('git2r', quietly = TRUE)) {
  git2r::repository()
} else {
  c(
    system2("git", args = c("log", "--name-status", "-1"), stdout = TRUE),
    system2("git", args = c("remote", "-v"), stdout = TRUE)
  )
}

## session info
sessionInfo()